package data_deepprocessing.algorithm.ssvm.ssvm_util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.deeplearning4j.models.word2vec.Word2Vec;

import data_deepprocessing.algorithm.crfs.CreateTrainDataSet2En;
import data_deepprocessing.algorithm.ssvm.bean.WordMarkBean;
import data_deepprocessing.algorithm.word_embedding.service.YYH_Word2Vector_EN_Service;
import data_deepprocessing.prepareData.beans.XianBingShi_new_zhangBean;
import data_deepprocessing.prepareData.db.InitSeedDB;
import data_deepprocessing.prepareData.db.XianBingShi_New_zhangDB;
import data_deepprocessing.util.FileUtil;

/** 
* @author  作者 : YUHU YUAN
* @date 创建时间：2017年3月31日 下午10:27:31 
* @version 1.0  
* 1:得到List<XianBingSHiBean> list；
* 2:得到分词后的结果；
* 3：通过word embedding 的到vector和 model文件
* 4：利用list中的信息何vector和model文件生成 ssvm要用的配置文件
* 5：制作训练集和测试集的数据
* 6：调用算法准备开始试验。
*	现在试验设计和实验方法部分就好写了吧，这部分的内容好好的组织一下 
*   
*	注释：XBS均指XianBingShi
*/

public class SsvmService2CrossValidService2EN {

	/** Output root directory for all generated CRF / SSVM data sets. */
	private static final String path = "D:\\yyh_yuanyuhu_graduation_experimental\\word_embedding_EN_SSVM\\dataset\\";

	/** Number of folds used for cross validation. */
	private static final int FOLD_COUNT = 10;

	/** Dimensionality of the placeholder vector emitted for out-of-vocabulary words. */
	private static final int OOV_VECTOR_DIM = 200;

	private InitSeedDB initSeedDB;
	private XianBingShi_New_zhangDB xianBingShi_New_zhangDB;
	private CreateTrainDataSet2En createTrainDataSet2En;

	/** Fold index (0..FOLD_COUNT-1) -> records assigned to that fold. */
	private Map<Integer, List<XianBingShi_new_zhangBean>> divided_dataset = initMap();

	/** Loads the segmented XianBingShi (present-illness-history) records from the DB. */
	private List<XianBingShi_new_zhangBean> doGetSegmentedXianBingShi() {
		return xianBingShi_New_zhangDB.selectTop1200XianBingShi2En();
	}

	/**
	 * Loads the pre-trained English Word2Vec model.
	 *
	 * @return the model, or {@code null} if loading failed (the error is logged).
	 */
	private Word2Vec doGetWord2Vec() {
		try {
			return YYH_Word2Vector_EN_Service.getWord2Vec();
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		}
	}

	/** Loads the symptom seed dictionary used to label the CRF training data. */
	private List<String> doGetSymptom_Dictionary() {
		return initSeedDB.selectAllSeedContent2En();
	}

	/**
	 * Randomly distributes all records into {@link #divided_dataset}.
	 * {@code size % FOLD_COUNT} cycles through every fold once per FOLD_COUNT
	 * removals, so fold sizes stay balanced while the record assigned to each
	 * fold is chosen at random.
	 */
	private void divideDataset() {
		List<XianBingShi_new_zhangBean> beanList = doGetSegmentedXianBingShi();
		while (!beanList.isEmpty()) {
			int size = beanList.size();
			int index = (int) (Math.random() * size);
			divided_dataset.get(size % FOLD_COUNT).add(beanList.remove(index));
		}
	}

	/** Creates an empty list for each of the FOLD_COUNT folds. */
	private Map<Integer, List<XianBingShi_new_zhangBean>> initMap() {
		Map<Integer, List<XianBingShi_new_zhangBean>> folds = new HashMap<>();
		for (int i = 0; i < FOLD_COUNT; i++) {
			folds.put(i, new ArrayList<XianBingShi_new_zhangBean>());
		}
		return folds;
	}

	/**
	 * Generates the CRF-format cross-validation data: for each fold i, writes the
	 * fold itself as the test set and the union of the other folds as the train
	 * set under {@code dataCRF<i>/test} and {@code dataCRF<i>/train}.
	 */
	public void doGenerateCRFCrossValidDataSet() {
		doCreateAllCRFData();
		divideDataset();
		// Files produced by doCreateAllCRFData(); may be null if the directory
		// is missing, which the downstream call must tolerate.
		File[] files = new File(path + "alldataCRF").listFiles();

		for (int i = 0; i < divided_dataset.size(); ++i) {
			String testPath = path + "dataCRF" + i + File.separator + "test";
			try {
				createTrainDataSet2En.doCreateENCRFData(testPath, files, divided_dataset.get(i));
			} catch (Exception e) {
				e.printStackTrace();
			}

			// Training set = every fold except fold i.
			List<XianBingShi_new_zhangBean> trainDataSet = new ArrayList<>();
			for (int j = 0; j < divided_dataset.size(); ++j) {
				if (j != i) {
					trainDataSet.addAll(divided_dataset.get(j));
				}
			}
			String trainPath = path + "dataCRF" + i + File.separator + "train";
			try {
				createTrainDataSet2En.doCreateENCRFData(trainPath, files, trainDataSet);
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Generates the CRF-format files for the full data set under
	 * {@code alldataCRF}; the cross-validation folds are later copied from it.
	 */
	private void doCreateAllCRFData() {
		List<String> initSeeds = doGetSymptom_Dictionary();
		List<XianBingShi_new_zhangBean> allDataSet = doGetSegmentedXianBingShi();
		createTrainDataSet2En.doCreateENCRFData(path + "alldataCRF", initSeeds, allDataSet);
	}

	/**
	 * Reads one CRF file ("word&lt;TAB&gt;mark" per line) into a list of beans.
	 * The reader is always closed, even on error.
	 */
	private List<WordMarkBean> readWordMarks(File file) throws IOException {
		List<WordMarkBean> beans = new ArrayList<>();
		try (BufferedReader reader = FileUtil.getReader(file)) {
			String line;
			while ((line = reader.readLine()) != null) {
				String[] parts = line.split("\t");
				beans.add(new WordMarkBean(parts[0], parts[1]));
			}
		}
		return beans;
	}

	/**
	 * Maps a BIO-style CRF mark to the numeric class label expected by the SVM
	 * tool: O -> 1, B-S -> 2, E-S -> 3. Unknown marks yield an empty tag,
	 * matching the historic behavior.
	 */
	private String markToTag(String mark) {
		if ("O".equals(mark)) {
			return "1";
		}
		if ("B-S".equals(mark)) {
			return "2";
		}
		if ("E-S".equals(mark)) {
			return "3";
		}
		return "";
	}

	/**
	 * Appends the "index:value" feature list for one word. For out-of-vocabulary
	 * words (null vector) a constant placeholder vector of OOV_VECTOR_DIM ones is
	 * emitted. NOTE(fix): the original code separated the placeholder features
	 * with '\t' while in-vocabulary features used ' '; normalized to a space,
	 * as svm_struct feature lines are space-separated.
	 */
	private void appendFeatures(StringBuilder sb, double[] vector) {
		if (vector != null) {
			for (int i = 0; i < vector.length; ++i) {
				sb.append(i + 1).append(':').append(vector[i]).append(' ');
			}
		} else {
			for (int i = 0; i < OOV_VECTOR_DIM; ++i) {
				sb.append(i + 1).append(":1 ");
			}
		}
	}

	/**
	 * Converts a directory of CRF files into one SSVM training file.
	 * Each CRF file becomes one query; the SVM tool requires qid to start at 1
	 * and strictly increase.
	 *
	 * @param outputPath destination training file
	 * @param inputPath  directory containing CRF-format input files
	 * @param word2Vec   embedding model used to build the feature vectors
	 * @throws IOException if reading or writing fails
	 */
	private void doGenerateSsvmDataSet2Train(String outputPath, String inputPath, Word2Vec word2Vec)
			throws IOException {
		File[] crfFiles = new File(inputPath).listFiles();
		if (crfFiles == null) {
			// Input path missing or not a directory: nothing to convert.
			return;
		}
		try (BufferedWriter writer = FileUtil.getWriter(outputPath)) {
			int qid = 1;
			StringBuilder term = new StringBuilder();
			for (File file : crfFiles) {
				for (WordMarkBean bean : readWordMarks(file)) {
					String word = bean.getWord().toLowerCase();
					double[] vector = word2Vec.getWordVector(word);
					term.append(markToTag(bean.getMark())).append(' ');
					term.append("qid:").append(qid).append(' ');
					appendFeatures(term, vector);
					// Trailing comment carries the (lowercased) word for debugging.
					term.append('#').append(word);
					writer.write(term.toString());
					writer.newLine();
					term.setLength(0);
				}
				writer.flush();
				++qid;
			}
		}
	}

	/**
	 * Converts a directory of CRF files into the SSVM test file plus a parallel
	 * "judge" file that additionally records the source file name for later
	 * evaluation.
	 *
	 * @param outputPath       destination test file (SVM input format)
	 * @param outputPath2Judge destination judge file (adds source file name per line)
	 * @param inputPath        directory containing CRF-format input files
	 * @param word2Vec         embedding model used to build the feature vectors
	 * @throws IOException if reading or writing fails
	 */
	private void doGenerateSsvmDataSet2Test(String outputPath, String outputPath2Judge, String inputPath,
			Word2Vec word2Vec) throws IOException {
		File[] crfFiles = new File(inputPath).listFiles();
		if (crfFiles == null) {
			return;
		}
		try (BufferedWriter writer = FileUtil.getWriter(outputPath);
				BufferedWriter writer2judge = FileUtil.getWriter(outputPath2Judge)) {
			int qid = 1;
			StringBuilder term = new StringBuilder();
			StringBuilder judgeTerm = new StringBuilder();
			for (File file : crfFiles) {
				for (WordMarkBean bean : readWordMarks(file)) {
					String word = bean.getWord().toLowerCase();
					double[] vector = word2Vec.getWordVector(word);
					String tag = markToTag(bean.getMark());
					term.append(tag).append(' ');
					term.append("qid:").append(qid).append(' ');
					judgeTerm.append(tag).append(' ');
					judgeTerm.append("qid:").append(file.getName()).append(' ').append(qid).append(' ');
					appendFeatures(term, vector);
					// Unlike the train file, the original-case word is recorded here.
					term.append('#').append(bean.getWord());
					judgeTerm.append('#').append(bean.getWord());
					writer.write(term.toString());
					writer.newLine();
					writer2judge.write(judgeTerm.toString());
					writer2judge.newLine();
					term.setLength(0);
					judgeTerm.setLength(0);
				}
				writer.flush();
				writer2judge.flush();
				++qid;
			}
		}
	}

	/**
	 * Generates the SSVM cross-validation data sets from the previously created
	 * CRF folds: for each fold i, {@code data<i>/test/test.txt},
	 * {@code data<i>/test/test2Judge.txt} and {@code data<i>/train/train.dat}.
	 */
	public void doGenerateSsvmCrossValidDataSet() {
		Word2Vec word2Vec = doGetWord2Vec();

		for (int i = 0; i < divided_dataset.size(); ++i) {
			String testDir = path + "data" + i + File.separator + "test" + File.separator;
			String crfTestPath = path + "dataCRF" + i + File.separator + "test";
			try {
				doGenerateSsvmDataSet2Test(testDir + "test.txt", testDir + "test2Judge.txt", crfTestPath, word2Vec);
			} catch (IOException e) {
				e.printStackTrace();
			}

			String trainPath = path + "data" + i + File.separator + "train" + File.separator + "train.dat";
			String crfTrainPath = path + "dataCRF" + i + File.separator + "train";
			try {
				doGenerateSsvmDataSet2Train(trainPath, crfTrainPath, word2Vec);
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	/** Entry point: generate the SSVM cross-validation data sets. */
	public void function() {
		// Generating the CRF-format documents is time-consuming; re-enable when
		// the CRF folds need to be rebuilt.
//		doGenerateCRFCrossValidDataSet();
		doGenerateSsvmCrossValidDataSet();
	}

	public InitSeedDB getInitSeedDB() {
		return initSeedDB;
	}

	public void setInitSeedDB(InitSeedDB initSeedDB) {
		this.initSeedDB = initSeedDB;
	}

	public XianBingShi_New_zhangDB getXianBingShi_New_zhangDB() {
		return xianBingShi_New_zhangDB;
	}

	public void setXianBingShi_New_zhangDB(XianBingShi_New_zhangDB xianBingShi_New_zhangDB) {
		this.xianBingShi_New_zhangDB = xianBingShi_New_zhangDB;
	}

	public CreateTrainDataSet2En getCreateTrainDataSet2En() {
		return createTrainDataSet2En;
	}

	public void setCreateTrainDataSet2En(CreateTrainDataSet2En createTrainDataSet2En) {
		this.createTrainDataSet2En = createTrainDataSet2En;
	}

	public Map<Integer, List<XianBingShi_new_zhangBean>> getDivided_dataset() {
		return divided_dataset;
	}

	public void setDivided_dataset(Map<Integer, List<XianBingShi_new_zhangBean>> divided_dataset) {
		this.divided_dataset = divided_dataset;
	}
}















